#  Analysis for "Benefits of Bundling" by Aryal, Murry, Pal, and Palit 
#  AEJ: Micro


# Top Matter ----
rm(list=ls())

if (!require("pacman")) install.packages("pacman")
pacman::p_load(readr, dplyr, tibble, feather, stringr, ggplot2,
               readxl, plm, magrittr, stargazer, maxLik,
               gtsummary, corrr, tidyr, fixest, janitor,
               kableExtra, huxtable, scales, ggthemes, broom, 
               margins, patchwork, rstudioapi)

# Automatically detect where main.R is located
script_dir <- dirname(rstudioapi::getActiveDocumentContext()$path)

# Go up one level from 2_Code to 2_Estimation
computer <- dirname(script_dir)
computer <- paste0(computer, "/")

# Raw data file names
fn_analysis <- paste0(computer, "1_Input/df_analysis.RData")

# Output tables and Figures
dir_savetabs <- paste0(computer, "3_Output/tables/")
dir_savefigs <- paste0(computer, "3_Output/figures/")

setwd(computer)

# Load utility functions
source(paste0(script_dir, "/utils.R"))



# Load Data ----
load(fn_analysis)


# Clean data ----
# We want to describe the market structure before cleaning the data, then 
# present all the price/bandwidth results after cleaning the data. 
# 
# Notes:
# Main things for us to clean
# - There are some crazy price values
# - remove schools who experience a price decrease and order less 
#    bandwidth (and vice versa) (we call these irrational)
# - we need to see schools in both periods
# - For the DiD, we further restrict to same-type of internet


### Clean up ISPs in original dataframe ----
df_analysis_clean_names <- df_analysis %>% mutate(
  vendor = case_when(
    vendor %in% c("Cablevision", "Cablevison") ~ "Cablevision",
    vendor %in% c("Century Link", "CenturyLink", "Centurylink") ~ "CenturyLink",
    vendor %in% c("Comcast", "Comcast and Advanza", "Comcast and FiberTech", 
                  "Comcast and PenTeleData", "Comcast and Verizon") ~ "Comcast",
    vendor %in% c("Fibertech", "FiberTech") ~ "FiberTech",
    vendor %in% c("LIghtpath", "LightPath", "Lightpath") ~ "Lightpath",
    vendor %in% c("Line Systems", "Line Systems, Inc") ~ "Line Systems",
    # vendor %in% c("MetComm.Net", "MetTel") ~ "MetTel",
    vendor %in% c("Monmouth Internet", "Monmouth Telecom") ~ "Monmouth Telecom",
    vendor %in% c("NetCarrier", "Net Access") ~ "NetCarrier",
    # vendor %in% c("Paetec", "Spectrotel") ~ "Spectrotel",
    vendor %in% c("TouchTone", "Touchtone") ~ "TouchTone",
    vendor == "Warwick Valley Telephone" ~ "Warwick Valley Telephone",
    vendor %in% c("XTel", "Xtel") ~ "XO",
    TRUE ~ vendor  # Keep all other vendor names as they are
  )
)



## Create sample for summary stats table ----
# watch out, erate does not exist for ever schools. 

df_analysis_stats <- df_analysis_clean_names %>% 
  mutate(total_mbps = as.numeric(total_mbps), 
         erate_discount = as.numeric(erate_discount),
         monthly_recurring_cost = as.numeric(monthly_recurring_cost),
         total_mbps_old = as.numeric(total_mbps_old)) %>%   
  mutate(expenditure_yearly = monthly_recurring_cost * 12) %>%  # new line
  arrange(district_organization_name, D) %>% 
  group_by(district_organization_name) %>% 
  mutate(numobs=n()) %>% 
  mutate(fiber = ifelse(type %in% c("FiOS", "Ethernet"), 1, 0),
         coaxial = ifelse(type %in% c("Cable Modem"), 1, 0),
         other = ifelse(type %in% c("Not Reported", "Wireless", "Dedicated access T-1 or DS-3", "DSL"), 1, 0)) %>%
  mutate(type_agg = case_when(
    fiber == 1 ~ "Fiber",
    coaxial == 1 ~ "Coax",
    other == 1 ~ "Other"
  )) %>%  # new line
  mutate(price_change = case_when(price[2]==price[1] ~ 'unchanged',
                                  price[2]<price[1] ~ 'decrease',
                                  price[2]>price[1] ~ 'increase')) %>% 
  mutate(bandwidth_change = case_when(total_mbps[2]==total_mbps[1] ~ 'unchanged',
                                      total_mbps[2]<total_mbps[1] ~ 'decrease',
                                      total_mbps[2]>total_mbps[1] ~ 'increase')) %>% 
  mutate(change_transport = type_agg[1] != type_agg[2]) %>%  # new line
  mutate(participant = ifelse(final_disposition %in% c("Participating"),TRUE,FALSE)) %>% 
  mutate(irrational = ((price_change=='decrease' & bandwidth_change=='decrease') | (price_change=='increase' & bandwidth_change=='increase'))) %>% 
  mutate(p99 = quantile(price,0.99, na.rm = TRUE),
         p95 = quantile(price,0.95, na.rm = TRUE)) %>% 
  group_by(district_organization_name) %>% 
  ungroup() %>% 
  filter(price<quantile(price,0.95, na.rm = TRUE)) %>%
  mutate(participant = ifelse(final_disposition == "Participating",TRUE,FALSE)) %>%
  relocate(price,.before=price_old) 


## Define schools that are "passive" -- have same terms two years in a row. ----
df_nonactive <- df_analysis_stats %>%
  group_by(district_organization_name) %>% 
  arrange(district_organization_name,D) %>% 
  mutate(nonactive = ifelse(price[1]==price[2] & vendor[1]==vendor[2],1,0)) %>% 
  select(district_organization_name,D,nonactive) %>% 
  ungroup()

df_analysis_stats <- left_join(df_analysis_stats,df_nonactive,by=c("district_organization_name","D"))

## Mark Cat-D users ---- 
# - We mark any school that does CatD in 2015 as a CatD school.
df_analysis_stats <- df_analysis_stats %>% 
  arrange(district_organization_name,D) %>% 
  group_by(district_organization_name) %>% 
  mutate(CatDSchool = ifelse(category_of_service[1] %in% c("Cat B/D- WAN/Internet","Cat D- Internet"),TRUE,FALSE)) %>%
  ungroup()



# Section 4 ----

## Sample Numbers ----
df_analysis_stats %>%
  filter(D==1) %>% 
  mutate(responded = !initial_disposition == "Unknown") %>%
  count(responded)

df_analysis_stats %>%
  filter(D==1) %>% 
  count(participant)


## Table 2 ----
# Calculate summary stats grouped by D
sumstats_D <- df_analysis_stats %>% 
  group_by(D) %>% 
  calculate_sumstats() %>%
  mutate(participant = "Overall") %>%
  select(D, participant, everything())  # Reorder columns to match sumstats_D_participant

# Calculate summary stats grouped by D and participant
sumstats_D_participant <- df_analysis_stats %>% 
  mutate(fiber = ifelse(type %in% c("FiOS", "Ethernet"), 1, 0),
         coaxial = ifelse(type %in% c("Cable Modem"), 1, 0),
         other = ifelse(type %in% c("Not Reported", "Wireless", "Dedicated access T-1 or DS-3", "DSL"), 1, 0)) %>% 
  group_by(D, participant) %>% 
  calculate_sumstats() %>%
  mutate(participant = as.character(participant))  # Ensure participant is character type

# Combine the two summary stat tables
combined_sumstats <- bind_rows(sumstats_D, sumstats_D_participant)

# Reshape the combined table with custom ordering
tab_reshaped <- combined_sumstats %>%
  pivot_longer(cols = -c(D, participant), 
               names_to = c("stat", "variable"), 
               names_pattern = "(.+)_(.+)") %>%
  pivot_wider(names_from = c(D, stat), 
              values_from = value) %>%
  mutate(
    variable = factor(variable, levels = c("price", "mbps", "isp", "fiber", "coaxial", "other", "catD")),
    participant = factor(participant, levels = c("Overall", "TRUE", "FALSE"))
  ) %>%
  arrange(variable, participant)

# Function to format column names
format_colnames <- function(x) {
  x %>%
    str_remove("^0_|^1_") %>%
    str_to_title()
}

# Function to format numbers to 2 decimal places
format_number <- function(x) {
  ifelse(is.na(x), "--", sprintf("%.2f", x))
}


# Prepare the data from tab_reshaped
table_data <- tab_reshaped %>%
  select(variable, participant, 
         `0_mean`, `0_median`, `0_sd`, 
         `1_mean`, `1_median`, `1_sd`) %>%
  mutate(across(c(`0_mean`, `0_median`, `0_sd`, `1_mean`, `1_median`, `1_sd`), format_number)) %>%
  mutate(
    variable = case_when(
      participant == "Overall" ~ case_when(
        variable == "price" ~ "Price",
        variable == "mbps" ~ "Bandwidth",
        variable == "isp" ~ "Num ISPs",
        variable == "fiber" ~ "Fiber",
        variable == "coaxial" ~ "Coaxial",
        variable == "other" ~ "Other",
        variable == "catD" ~ "Category D",
        TRUE ~ variable
      ),
      participant == "TRUE" ~ "--Participant",
      participant == "FALSE" ~ "--Non-participant"
    ),
    row_type = case_when(
      participant == "Overall" ~ "main",
      TRUE ~ "sub"
    )
  ) 

# Function to add line space every three rows
add_line_space <- function(latex_code) {
  lines <- str_split(latex_code, "\n")[[1]]
  data_start <- which(str_detect(lines, "^\\\\toprule"))[1] + 8  # Start after header
  data_end <- which(str_detect(lines, "^\\\\bottomrule"))[1] - 1  # End before bottom rule
  
  for (i in seq(data_start, data_end, by = 4)) {
    if (i < data_end) {
      lines <- c(lines[1:i], "\\addlinespace[0.5em]", lines[(i+1):length(lines)])
      data_end <- data_end + 1  # Adjust end position for inserted line
    }
  }
  
  paste(lines, collapse = "\n")
}


# Create the LaTeX table
latex_table <- table_data %>%
  select(-row_type, -participant) %>%
  kbl(format = "latex", booktabs = TRUE, longtable = FALSE,
      col.names = c("Outcome", "Mean", "Median", "SD", "Mean", "Median", "SD"),
      align = c("l", rep("c", 6)),
      caption = "Summary Statistics",
      label = "sumstats",
      linesep = "",
      format.args = list(big.mark = ",")) %>%  # Optional: formats large numbers with commas
  kable_styling(latex_options = c("hold_position", "scale_down"),
                full_width = FALSE) %>%
  # kable_styling(latex_options = c("hold_position"),
  #               full_width = FALSE) %>%
  add_header_above(c(" " = 1, "Pre Consortium (2014)" = 3, "Post Consortium (2015)" = 3)) %>%
  column_spec(1, width = "5cm") %>%
  row_spec(0, bold = TRUE) %>%
  row_spec(which(grepl("^[A-Z]", table_data$variable)), bold = TRUE) %>%
  gsub("\\centering","\\small\\\\centering",.) %>% 
  gsub("NA", "--", .) %>%
  gsub("\\\\midrule", "\\\\midrule\n\\\\addlinespace[0.3cm]", .) %>% 
  add_line_space()

cat(latex_table)

# Write the LaTeX code to a file
writeLines(latex_table, paste0(dir_savetabs,"tab_sumstats.tex"))

### Hardcode this to footnote in table 2. 
df_analysis_stats %>% count(D)
df_analysis_stats %>% filter(D==1) %>% count(participant)
df_analysis_stats %>% filter(D==1) %>% count(initial_disposition,participant)



# Section 5 ----

## Table 3 ----

### Run regressions ----
did_price <- df_analysis_stats %>% 
  rename(Time=D) %>%  mutate(D=participant)  %>% 
  lm(price ~ D + Time + D*Time -1, data=.)

did_q <- df_analysis_stats %>% 
  rename(Time=D) %>%  mutate(D=participant)  %>%     
  lm(total_mbps ~ D + Time + D*Time -1, data=.)

did_price_control <- df_analysis_stats %>% 
  rename(Time=D) %>%  mutate(D=participant)  %>%     
  lm(price ~ D + Time + D*Time -1+ num_isp + i(vendor) + i(region) + i(type_agg) + i(organization_type), data=.)

did_q_control <- df_analysis_stats %>% 
  rename(Time=D) %>%  mutate(D=participant)  %>%     
  lm(total_mbps ~ D + Time + D*Time -1 + i(vendor) + i(region) + i(type_agg) + i(organization_type), data=.)

### Make the table ----
names(did_price$coefficients) <- c("Non-participant", "Participant", "Post-Consortium", "Participant x Post-Consortium")
names(did_q$coefficients) <- c("Non-participant", "Participant", "Post-Consortium", "Participant x Post-Consortium")
coef_names_price_control <- names(did_price_control$coefficients)
coef_names_price_control[coef_names_price_control == "DFALSE"] <- "Non-participant"
coef_names_price_control[coef_names_price_control == "DTRUE"] <- "Participant"
coef_names_price_control[coef_names_price_control == "Time"] <- "Post-Consortium"
coef_names_price_control[coef_names_price_control == "DTRUE:Time"] <- "Participant x Post-Consortium"
coef_names_price_control[coef_names_price_control == "num_isp"] <- "Number of ISPs"
names(did_price_control$coefficients) <- coef_names_price_control

coef_names_q_control <- names(did_q_control$coefficients)
coef_names_q_control[coef_names_q_control == "DFALSE"] <- "Non-participant"
coef_names_q_control[coef_names_q_control == "DTRUE"] <- "Participant"
coef_names_q_control[coef_names_q_control == "Time"] <- "Post-Consortium"
coef_names_q_control[coef_names_q_control == "DTRUE:Time"] <- "Participant x Post-Consortium"
names(did_q_control$coefficients) <- coef_names_q_control

### Create the table (using code in utils.R) ----
main_table <- create_latex_table(
  models = list(did_price, did_q, did_price_control, did_q_control),
  dep_vars = c("Price", "Bandwidth", "Price", "Bandwidth"),
  model_numbers = 1:4,
  caption = "Estimated Effect of Demand Bundling",
  label = "tab:did_main",
  controls_note = c("School Type", "ISP", "Region", "Service Type"),
  controls_in_columns = c(3,4),  # controls only in last two columns
  table_note = paste("This table reports difference-in-differences estimates from equation (\\ref{eq:did})",
                     "under varying specifications. The analysis is structured as follows:",
                     "Columns (1) and (2) present baseline estimates without control variables, and",
                     "Columns (3) and (4) present estimates with a full set of control variables.")
)

# Write table to file
writeLines(main_table, paste0(dir_savetabs,"did_results.tex"))





## Table 4. ----
did_price_control_CatA <- df_analysis_stats %>% 
  rename(Time=D) %>%  mutate(D=participant)  %>%   
  filter(CatDSchool==FALSE) %>%
  lm(price ~  + D + Time + D*Time -1 + num_isp + i(vendor) + i(type_agg) + i(region) + i(organization_type), data=.)

did_price_control_CatD <- df_analysis_stats %>%
  rename(Time=D) %>%  mutate(D=participant)  %>%     
  filter(CatDSchool==TRUE) %>%
  lm(price ~ D + Time + D*Time -1 + num_isp + i(vendor) + i(type_agg) + i(region) + i(organization_type), data=.)

did_q_control_CatA <- df_analysis_stats %>% 
  rename(Time=D) %>%  mutate(D=participant)  %>%   
  filter(CatDSchool==FALSE) %>% 
  lm(total_mbps ~ D + Time + D*Time -1 + i(vendor)+ i(region) + i(type_agg) + i(organization_type), data=.)

did_q_control_CatD <- df_analysis_stats %>% 
  rename(Time=D) %>%  mutate(D=participant)  %>%    
  filter(CatDSchool==TRUE) %>% 
  lm(total_mbps ~ D + Time + D*Time -1 + i(vendor)+ i(region) + i(type_agg) + i(organization_type), data=.)


CatAD_table <- create_latex_table(
  models = list(did_price_control_CatA, did_q_control_CatA, did_price_control_CatD,  did_q_control_CatD),
  dep_vars = c("Price", "Bandwidth", "Price",  "Bandwidth"),
  model_numbers = 5:8,
  caption = "Effect of Demand Bundling for Different Internet Products",
  label = "tab:did_cats",
  controls_note = c("School Type", "ISP", "Region", "Service Type"),
  controls_in_columns = c(1,2,3,4),  # controls in all four columns
  column_groups = list(
    list(name = "Category A", columns = 1:2),
    list(name = "Category D", columns = 3:4)
  ),
  table_note = paste("This table reports difference-in-differences estimates from equation (\\ref{eq:did})",
                     "for the two products separately: Category A and Category D (explained in text). All the specifications include school type,
ISP, and region and service type fixed effects. The analysis is structured as follows:",
                     "Columns (5) and (7) present price as dependent variable for Category A and D, respectively.",
                     "Columns (6) and (8) present bandwidth as a dependent variable for Category A and D, respectively.")
)

# Write table to file
writeLines(CatAD_table, paste0(dir_savetabs,"did_results_catAD.tex")) 

## Table 5. ----
# This appears directly in the text because it is derived from different data.

## Table 6 ----
# The code creates a table with several coefficients. In the paper Table 6 we 
# display the coefficients for only the last two rows: Participant x Post-Consortium
# and Potential Bidders

### Create alternative num_isp variable
df_bidders <- df_analysis_stats %>% 
  mutate(potential_bidders = num_isp) %>% 
  mutate(potential_bidders = ifelse(D==1 & final_disposition=="Participating" & region=="Northeastern",11,potential_bidders)) %>% 
  mutate(potential_bidders = ifelse(D==1 & final_disposition=="Participating" & region=="Northwestern",6,potential_bidders)) %>% 
  mutate(potential_bidders = ifelse(D==1 & final_disposition=="Participating" & region=="Southern",10,potential_bidders)) %>% 
  mutate(potential_bidders = ifelse(D==1 & final_disposition=="Participating" & region=="Central",11,potential_bidders)) %>% 
  select(!num_isp) %>% rename(num_isp = potential_bidders)

df_bidders_complete <- df_analysis_stats %>% 
  mutate(potential_bidders = num_isp) %>% 
  mutate(potential_bidders = ifelse(D==1 & final_disposition=="Participating" & region=="Northeastern",2,potential_bidders)) %>% 
  mutate(potential_bidders = ifelse(D==1 & final_disposition=="Participating" & region=="Northwestern",2,potential_bidders)) %>% 
  mutate(potential_bidders = ifelse(D==1 & final_disposition=="Participating" & region=="Southern",1,potential_bidders)) %>% 
  mutate(potential_bidders = ifelse(D==1 & final_disposition=="Participating" & region=="Central",0,potential_bidders)) %>% 
  select(!num_isp) %>% rename(num_isp = potential_bidders)

did_price_potential_bidders <- df_bidders %>% 
  rename(Time=D) %>%  mutate(D=participant)  %>% 
  filter(CatDSchool==TRUE) %>%
  lm(price ~ D + Time + D*Time -1+ num_isp + i(vendor) + i(type_agg) + i(region) + i(organization_type), data=.)

did_price_potential_bidders_complete <- df_bidders_complete %>% 
  rename(Time=D) %>%  mutate(D=participant)  %>% 
  filter(CatDSchool==TRUE) %>%
  lm(price ~ D + Time + D*Time -1+ num_isp + i(vendor) + i(type_agg) + i(region) + i(organization_type), data=.)


did_potential_table <- create_latex_table(
  models = list(did_price_potential_bidders, did_price_potential_bidders_complete),
  dep_vars = c("Price", "Price"),
  model_numbers = 8:9,
  caption = "Price Effects Controlling for Consortium Competition ",
  label = "tab:competition_bounds",
  # controls_note = c("School Type", "ISP", "Region", "Service Type"),
  # controls_in_columns = c(1,2,3,4),  # controls in all four columns
  # column_groups = list(
  #   list(name = "Category A", columns = 1:2),
  #   list(name = "Category D", columns = 3:4)
  # ),
  table_note = paste("This table presents DiD estimates of price effects for Category D services",
                     "using alternative measures of competition. All the specifications include school type,",
                     "ISP, region, and service type fixed effects. Column (8) uses total bidders (including",
                     "partial coverage bids) as the competition measure. Column (9) uses only complete",
                     "bidders who agreed to serve entire regions. Standard errors in parentheses.")
)

# Write table to file
writeLines(did_potential_table, paste0(dir_savetabs,"did_competition_bound_catD.tex")) 


# Section 5.3 ----

## Figure 3 ----
# !!! Now onwards subset the analysis dataset to only the catD school!!!!
df_analysis_stats_catD <- df_analysis_stats %>% 
  filter(CatDSchool==TRUE) 

### Step 0. re estimate the Price and Q DiD with controls (KEEP the intercept... )  ----
did_price_control_ManskiPepper <- df_analysis_stats_catD %>% 
  mutate(participant = ifelse(final_disposition == "Participating",TRUE,FALSE)) %>%
  rename(Time=D) %>%  mutate(D=participant)  %>%     
  lm(price ~ D + Time + D*Time + num_isp + i(vendor) + i(type)+ i(region) + i(organization_type), data=.)


did_q_control_ManskiPepper <- df_analysis_stats_catD %>% 
  mutate(participant = ifelse(final_disposition == "Participating",TRUE,FALSE)) %>%
  rename(Time=D) %>%  mutate(D=participant)  %>%     
  lm(total_mbps ~ D + Time + D*Time  + i(vendor)+ i(region) + i(organization_type) + i(category_of_service), data=.)

### Step 1: calculate did estimates at different degrees of violations ----
grids <- seq(0, 2.5, by = 0.10) # degrees of violation. g=1 => parallel trend

did_trend_violation <- function(beta, trend){
  output <- sapply(grids, function(g) (beta + trend) - g * trend) # here beta is ATT and (trend -g* trend) measures the violation of parallel trends (bias) except when g=1, then the bias will be 0
  output <- as.numeric(output)
  return(output)
} 

price_coefficients <- coef(did_price_control_ManskiPepper)
price_estimates <- did_trend_violation(price_coefficients["DTRUE:Time"], price_coefficients["Time"])

q_coefficients <- coef(did_q_control_ManskiPepper)
q_estimates <- did_trend_violation(q_coefficients["DTRUE:Time"], q_coefficients["Time"])

### Step 2: calculate standard errors ----
#beta_new =  beta + (1-g*) trend. s.e (beta_new) = sqrt{ (s.e. beta)^2 + (1-g)^2 * (s.e. trend)^2 + 2 * (1-g) * cov( beta, trend)}

did_trend_violation_se <- function(beta_se, trend_se, cov_beta_trend){
  output <- sapply(grids, function(g) sqrt(beta_se^2 + (1-g)^2 * trend_se^2 + 2*(1-g)*cov_beta_trend ))
  output <- as.numeric(output)
  return(output)
} 

price_variance <- vcov(did_price_control)
# price_estimates_se <- did_trend_violation_se(sqrt(price_variance["DTRUE:Time", "DTRUE:Time"]), sqrt(price_variance["Time", "Time"]), price_variance["Time", "DTRUE:Time"])
price_estimates_se <- did_trend_violation_se(sqrt(price_variance["Participant x Post-Consortium", "Participant x Post-Consortium"]), sqrt(price_variance["Post-Consortium", "Post-Consortium"]), price_variance["Participant x Post-Consortium"])

q_variance <- vcov(did_q_control)
# q_estimates_se <- did_trend_violation_se(sqrt(q_variance["DTRUE:Time", "DTRUE:Time"]), sqrt(q_variance["Time", "Time"]), q_variance["Time", "DTRUE:Time"])
q_estimates_se <- did_trend_violation_se(sqrt(q_variance["Participant x Post-Consortium", "Participant x Post-Consortium"]), sqrt(q_variance["Post-Consortium", "Post-Consortium"]), q_variance["Participant x Post-Consortium"])



### Step 3: calculate 95% CI ----
# lower_bound <- estimate - critical_value * se AND upper_bound <- estimate + critical_value * se
did_trend_violation_CI <- function(beta, trend, beta_se, trend_se, cov_beta_trend){
  confidence_level <-0.95
  # Calculate the critical value from the standard normal distribution
  critical_value <- qnorm((1 + confidence_level) / 2) # approx 1.96
  lower_bound <- sapply(grids, function(g) ((beta + trend) - g * trend) - critical_value * (sqrt(beta_se^2 + (1-g)^2 * trend_se^2 + 2*(1-g)*cov_beta_trend )))
  upper_bound <- sapply(grids, function(g) ((beta + trend) - g * trend) + critical_value * (sqrt(beta_se^2 + (1-g)^2 * trend_se^2 + 2*(1-g)*cov_beta_trend )))
  output      <- data.frame("Lower_Bound" = lower_bound, "Upper_Bound" = upper_bound)
  return(output)
} 


price_estimates_CI <- did_trend_violation_CI(price_coefficients["DTRUE:Time"], price_coefficients["Time"], sqrt(price_variance["Participant x Post-Consortium", "Participant x Post-Consortium"]), sqrt(price_variance["Post-Consortium", "Post-Consortium"]), price_variance["Post-Consortium", "Participant x Post-Consortium"])
q_estimates_CI <- did_trend_violation_CI(q_coefficients["DTRUE:Time"], q_coefficients["Time"], sqrt(q_variance["Participant x Post-Consortium", "Participant x Post-Consortium"]), sqrt(q_variance["Post-Consortium", "Post-Consortium"]), q_variance["Post-Consortium", "Participant x Post-Consortium"])


Price_all <- data.frame("Scale" = grids, "Price" = price_estimates, "SE"= price_estimates_se, price_estimates_CI)
Q_all <- data.frame("Scale" = grids, "Q" = q_estimates, "SE"= q_estimates_se, q_estimates_CI)


### Plots

Price_all <- Price_all %>%
  mutate(colorkey="") %>%
  mutate(colorkey = ifelse(Scale==1.0,"#E74C3C","#5A8CAE"))

Q_all <- Q_all %>%
  mutate(colorkey="") %>%
  mutate(colorkey = ifelse(Scale==1.0,"#E74C3C","#5A8CAE"))

price_plot <- create_professional_plot(
  data = Price_all,
  y_var = "Price",
  y_label = "Price Effect",
  # plot_title = "Price Effects by Violation Degree"
)

save_professional_plot(
  plot = price_plot,
  filename = paste0(dir_savefigs, "bounds_price.eps")
)

bandwidth_plot <- create_professional_plot(
  data = Q_all,
  y_var = "Q",
  y_label = "Bandwidth Effect",
  plot_title = "Bandwidth Effects by Violation Degree"
)
save_professional_plot(
  plot = bandwidth_plot,
  filename = paste0(dir_savefigs, "bounds_bandwidth.eps")
)



## Table 7. ----

did_price_active_control <- df_analysis_stats %>% 
  filter(nonactive==0) %>%
  rename(Time=D) %>%  mutate(D=participant)  %>%     
  filter(CatDSchool==TRUE) %>% 
  lm(price ~ D + Time + D*Time + num_isp + i(vendor) + i(type)+ i(region) + i(organization_type) -1, data=.)

did_price_noimpacted_control <- df_analysis_stats %>% 
  filter(final_disposition!="Impacted") %>% 
  rename(Time=D) %>%  mutate(D=participant)  %>%    
  filter(CatDSchool==TRUE) %>% 
  lm(price ~ D + Time + D*Time+ num_isp + i(vendor) + i(type)+ i(region) + i(organization_type) -1, data=.)


## Mbps

did_q_active_control <- df_analysis_stats %>% 
  filter(nonactive==0) %>% 
  rename(Time=D) %>%  mutate(D=participant)  %>%     
  filter(CatDSchool==TRUE) %>% 
  lm(total_mbps ~ D + Time + D*Time + i(vendor) + i(type)+ i(region) + i(organization_type) -1, data=.)

did_q_noimpacted_control <- df_analysis_stats %>% 
  filter(final_disposition!="Impacted") %>% 
  rename(Time=D) %>%  mutate(D=participant)  %>%     
  filter(CatDSchool==TRUE) %>% 
  lm(total_mbps ~ D + Time + D*Time+ i(vendor) + i(type)+ i(region) + i(organization_type) -1, data=.)

## 69 schools (that neither expressed interest in the program nor
#  held contracts with the winning ISP in their region) 

#### First we need to define the schools and menus availabe to non-participants
df_price_schedule <- df_analysis_stats_catD %>% filter(D==1) %>% 
  filter(final_disposition=="Participating") %>% 
  distinct(region, total_mbps, category_of_service, price) %>% 
  arrange(region, category_of_service, total_mbps, price) %>% 
  group_by(region, category_of_service, total_mbps) %>% 
  summarize(winning_price_mean = mean(price), winning_price_med = median(price), 
            winning_price_min = min(price)) %>% 
  mutate(D=1) %>%
  mutate(total_mbps = as.numeric(total_mbps)) %>% 
  ungroup()

df_analysis_w_schedule <- left_join(df_analysis_stats,df_price_schedule, by=c("region","category_of_service","total_mbps","D"))

df_analysis_counterfactual <- df_analysis_w_schedule %>% 
  filter(same_service==TRUE) %>% 
  mutate(new_price_med = ifelse(final_disposition=="Participating",price,winning_price_med),
         new_price_min = ifelse(final_disposition=="Participating",price,winning_price_min), 
         new_price_mean = ifelse(final_disposition=="Participating",price,winning_price_mean),
         new_price_imputed = !is.na(new_price_mean))  %>% 
  group_by(D,final_disposition) %>% 
  summarize(meanprice = mean(price), 
            cf_price_mean = mean(new_price_mean,na.rm=TRUE), 
            cf_count = sum(new_price_imputed),
            count = n())

df_analysis_w_schedule %>% mutate(price_cf_diff = winning_price_mean - price) %>% 
  filter(same_service==TRUE) %>% 
  filter(final_disposition!="Participating") %>%
  group_by(final_disposition) %>% 
  arrange(final_disposition,price_cf_diff) %>% 
  filter(final_disposition=="Not Participating") -> df_final_cf

df_final_cf %>% 
  filter(price_cf_diff<0) %>% 
  summarize(mean(price_cf_diff),median(price_cf_diff))

schools69 <- df_final_cf %>% 
  ungroup() %>% 
  filter(!is.na(winning_price_mean)) %>% 
  select(district_organization_name) %>% 
  mutate(sample69 = TRUE)

df_analysis_stats_69 <- left_join(df_analysis_stats, schools69, by="district_organization_name") %>% 
  filter(CatDSchool==TRUE) %>% 
  mutate(final_sample = sample69==TRUE | participant==TRUE)


### Price 69
did_price_control_69 <- df_analysis_stats_69 %>% 
  filter(final_sample == TRUE) %>% 
  rename(Time=D) %>%  mutate(D=participant)  %>%     
  lm(price ~ D + Time + D*Time -1+ num_isp + i(vendor) + i(type_agg) + i(region) + i(organization_type), data=.)

### Quantity 69
did_q_control_69 <- df_analysis_stats_69 %>% 
  filter(final_sample == TRUE) %>% 
  rename(Time=D) %>%  mutate(D=participant)  %>%     
  lm(total_mbps ~ D + Time + D*Time -1  + i(vendor) + i(type_agg) + i(region) + i(organization_type), data=.)



robustness_table_69 <- create_latex_table(
  models = list(did_price_active_control, did_price_noimpacted_control, did_price_control_69,
                did_q_active_control, did_q_noimpacted_control, did_q_control_69),
  dep_vars = c("Price", "Price", "Price", "Bandwidth", "Bandwidth", "Bandwidth"),
  model_numbers = 1:6,
  caption = "Robustness Checks: Alternative Sample Specifications",
  label = "tab:did_robust",
  controls_note = c("School Type", "ISP", "Region", "Service Type"),
  controls_in_columns = 1:6,  # controls in all columns
  table_note = paste("This table presents robustness checks using alternative sample specifications.",
                     "Columns (10) and (13) restrict the sample to active participants only.",
                     "Columns (11) and (14) exclude impacted schools from the analysis.",
                     "Columns (12) and (15) include only the schools defined in Section 5.3.3 as controls.",
                     "All specifications include the full set of control variables and include only Category D schools.")
)
cat(robustness_table_69)

# Write tables to files
writeLines(robustness_table_69, paste0(dir_savetabs,"did_robustness_69.tex"))




## Figure 4


ggplot(df_final_cf, aes(x = winning_price_mean, y = price)) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed", alpha = 0.7) +
  geom_point(alpha = 0.5, color = "#5A8CAE", size=3) +
  labs(
    x = "Consortium Price in 2015 ($)",
    y = "Actual Price in 2015 ($)"
  ) +
  # Set both axes to range from 0 to 50
  scale_x_continuous(limits = c(0, 50)) +
  scale_y_continuous(limits = c(0, 50)) +
  theme_minimal(base_size = 24) +
  theme(
    text = element_text(family = "serif", size = 24),
    plot.title = element_text(size = 24, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 24),
    axis.text = element_text(size = 24),
    legend.position = "none",
    axis.line = element_line(color = "black", size = 0.5),
    panel.grid = element_blank(),
    plot.margin = margin(20, 20, 20, 20),
    axis.ticks = element_line(color = "black", size = 0.5),
    axis.ticks.length = unit(3, "pt")
  )

# Save with high resolution
ggsave(
  paste0(dir_savefigs,"price_difference_scatter.pdf"),
  width = 8,
  height = 6,
  dpi = 300,
  device = cairo_pdf
)

### Numbers for the text
df_final_cf %>% count(price_cf_diff>0)
df_final_cf %>% filter(price_cf_diff<=0) %>% summarize(mean = mean(price_cf_diff))
df_final_cf %>% filter(price_cf_diff>0) %>% summarize(mean = mean(price_cf_diff))



# Section 6 ----
## Table 8 ----

df_analysis_stats_catD %>% 
  mutate(erate = as.numeric(erate_discount)) %>% 
  mutate(oop_spend = 12*price*total_mbps*(1-erate), fcc_spend = 12*price*total_mbps*erate, total_spend = 12*price*total_mbps) %>% 
  group_by(participant) %>% 
  filter(D==0) %>% 
  summarize(sum(oop_spend, na.rm=TRUE), sum(fcc_spend,na.rm=TRUE), sum(total_spend, na.rm=TRUE))


price_coefficients <- coef(did_price_control_CatD) # Grab all coefficients
price_DiD <- price_coefficients["DTRUE:Time"]
q_coefficients <- coef(did_q_control_CatD)
q_DiD <- q_coefficients["DTRUE:Time"]

df_erate <- df_analysis_stats_catD %>% 
  arrange(district_organization_name, D) %>%
  group_by(district_organization_name) %>%
  mutate(mbps = as.numeric(total_mbps), erate = as.numeric(erate_discount)) %>%
  filter(D==0) %>%
  mutate(c = mbps * price * erate,
         e = (mbps + q_DiD)*price*(1-erate),
         b = (mbps + q_DiD) * (price + price_DiD) * (1-erate),
         final_comparison = (e-b)/c,
         numerator = (e-b), #numerator is upper bound 
         denom = c) %>%
  # now suppose you keep your Q the same but get lower price, how much savings
  mutate(e2 = (mbps)*price*(1-erate),
         b2 = (mbps) * (price + price_DiD) * (1-erate),
         final_comparison2 = (e2-b2)/c,
         numerator2 = e2-b2 ) %>% #numerator2 is lowerbound
  na.omit()%>% 
  ungroup()



Analysis_erate <- df_erate %>% 
  na.omit(df_erate) %>%
  filter(participant==TRUE) %>%
  summarize(
    mean1 = mean(final_comparison),
    median1 = median(final_comparison),
    std1 = sd(final_comparison),
    iq1 = IQR(final_comparison),
    mean2 = mean(final_comparison2),
    median2 = median(final_comparison2),
    std2 = sd(final_comparison2),
    iq2 = IQR(final_comparison2),
    sum1 = 12*sum(numerator),  #upper bound total 
    sum2 = 12*sum(numerator2), #lower bound total 
    sum_denom = 12*sum(denom)
  )

Analysis_erate

# lower bound total savings is sum2  
Analysis_erate$sum2

# upper bound total savings is sum1   
Analysis_erate$sum1

# total e-rate subsidy is sum_denom 
Analysis_erate$sum_denom

# so the % savings range from 
lower_bound<-(Analysis_erate$sum2/Analysis_erate$sum_denom)*100
print(lower_bound)
#to 
upper_bound<-(Analysis_erate$sum1/Analysis_erate$sum_denom)*100
print(upper_bound)

## Figure 5 ----

### Manski Pepper Applied to Expenditure

# Function to calculate both bounds for given price_DiD and q_DiD values
calculate_bounds <- function(price_DiD_val, q_DiD_val, df) {
  bounds <- df %>%
    mutate(
      e = (mbps + q_DiD_val)*price*(1-erate),
      b = (mbps + q_DiD_val) * (price + price_DiD_val) * (1-erate),
      upper_bound_monthly = (e-b), 
      e2 = (mbps)*price*(1-erate),
      b2 = (mbps) * (price + price_DiD_val) * (1-erate),
      lower_bound_monthly = e2-b2
    ) %>%
    summarize(
      lower_bound = 100*sum(lower_bound_monthly)/sum(total_erate),
      upper_bound = 100*sum(upper_bound_monthly)/sum(total_erate)
    )
  
  return(c(lower_bound = bounds$lower_bound, upper_bound = bounds$upper_bound))
}

# Create the base dataframe for calculation
df_calc <- df_analysis_stats_catD %>% 
  mutate(participant = ifelse(final_disposition == "Participating", TRUE, FALSE)) %>%
  arrange(district_organization_name, D) %>%
  group_by(district_organization_name) %>%
  mutate(mbps = as.numeric(total_mbps), 
         erate = as.numeric(erate_discount)) %>%
  filter(D==0) %>%
  mutate(total_erate = mbps * price * erate) %>% 
  na.omit() %>%
  ungroup() %>%
  filter(participant==TRUE)

# Create the new dataframe with both bounds
df_grids <- data.frame(Scale = grids)
bounds_matrix <- t(mapply(
  calculate_bounds,
  Price_all$Price[match(df_grids$Scale, Price_all$Scale)],
  Q_all$Q[match(df_grids$Scale, Q_all$Scale)],
  MoreArgs = list(df = df_calc)
))

# Combine the results
df_grids$lower_bound <- bounds_matrix[, "lower_bound"]
df_grids$upper_bound <- bounds_matrix[, "upper_bound"]


# Calculate appropriate y-axis breaks
y_min <- min(df_grids$lower_bound)
y_max <- max(df_grids$upper_bound)
# Making breaks every 100 units - adjust this number as needed
break_size <- 100

# Using your existing colors plus a complementary third color for the reference line
manski_pepper_expenditure <- ggplot(df_grids, aes(x = Scale)) +
  # Connecting line between bounds
  geom_segment(
    aes(x = Scale, xend = Scale, y = lower_bound, yend = upper_bound,
        color = Scale == 1.0),
    size = 1
  ) +
  scale_color_manual(values = c("FALSE" = "#7F8C8D", "TRUE" = "#E74C3C")) +
  # Lower bounds points
  geom_point(
    aes(y = lower_bound, color = Scale == 1.0, fill = Scale == 1.0), 
    size = 3.5,
    shape = 21,
    stroke = 0.5
  ) +
  geom_point(
    aes(y = upper_bound, color = Scale == 1.0, fill = Scale == 1.0),
    size = 3.5,
    shape = 21,
    stroke = 0.5
  ) +
  scale_color_manual(values = c("FALSE" = "#5A8CAE", "TRUE" = "#E74C3C")) +
  scale_fill_manual(values = c("FALSE" = "#5A8CAE", "TRUE" = "#E74C3C")) +
  # Labels and theme
  labs(
    x = expression(atop("Deviation from Parallel Trends (g)",
                        atop(italic("(g = 1.0 implies parallel trends)"), ""))),
    y = "Savings (% of E-rate Subsidy)"
  ) +
  theme_minimal(base_size = 24) +
  theme(
    text = element_text(family = "serif"),
    plot.title = element_text(size = 24, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 24),
    axis.text = element_text(size = 24),
    legend.position = "none",
    panel.grid.minor = element_blank(),
    panel.grid.major = element_line(color = "grey90"),
    plot.margin = margin(20, 20, 20, 20)
  ) +
  scale_y_continuous(
    breaks = seq(
      floor(y_min/break_size) * break_size, 
      ceiling(y_max/break_size) * break_size, 
      by = break_size
    )
  )

manski_pepper_expenditure

# Optional: Save with high resolution
ggsave(
  paste0(dir_savefigs,"Manski_Expenditure.pdf"),
  width = 8,
  height = 6,
  dpi = 300,
  device = cairo_pdf
)

## Figure 6 ----
Welfare_bounds <- df_analysis_stats_catD %>%
  filter(change_transport==FALSE) %>%
  mutate(erate_discount = as.numeric(erate_discount))%>%
  group_by(district_organization_name) %>%
  filter(!any(is.na(erate_discount))) %>%  # Keep groups where erate is NaN
  arrange(D) %>%
  mutate(cs_low = total_mbps[1]*(price[1]*(1-erate_discount[1])-price[2]*(1-erate_discount[2]))/10000) %>%
  mutate(RHS1=price[1]*(1-erate_discount[1])-price[2]*(1-erate_discount[2])) %>% 
  mutate(RHS2 = total_mbps[2]-total_mbps[1]) %>% 
  mutate(RHS3 = log(total_mbps[2]/total_mbps[1])) %>%
  mutate(cs_high = (RHS1*RHS2)/(RHS3*10000))%>%
  mutate(cs_high = ifelse(price_change=="decrease" & bandwidth_change=="unchanged",cs_low,cs_high)) %>% 
  filter(!(price_change == "unchanged" & bandwidth_change == "Decrease") & !(price_change == "increase" & bandwidth_change == "unchanged")) %>%
  mutate(cs_interval = cs_high-cs_low)%>%
  filter(D == 1)%>%
  mutate(gain_loss=ifelse(cs_low >= 0, "gain", "loss"))%>%
  mutate(cs_interval = ifelse(gain_loss=="loss",-cs_interval,cs_interval)) %>% 
  filter (bandwidth_change != "unchanged") %>% 
  ungroup()%>%
  arrange(cs_high) 


Welfare_bounds_participating <- as.data.frame(Welfare_bounds) %>%
  filter(final_disposition=="Participating") %>%
  mutate(id=row_number())



Welfare_bounds_Not_participating <- as.data.frame(Welfare_bounds) %>%
  filter(final_disposition=="Not Participating") %>%
  mutate(id=row_number())

CS_bounds_p <- Welfare_bounds_participating %>%
  ggplot(., aes(x = id)) +
  # Horizontal grid lines for easier tracking
  geom_hline(
    yintercept = seq(-1.5, 4.5, by = 0.5),
    color = "grey95",
    size = 0.5
  ) +
  # Error bars
  geom_errorbar(
    aes(ymin = cs_low, ymax = cs_high),
    width = 0.2,
    size = 0.8,
    color = "#7F8C8D"
  ) +
  # Lower bound points
  geom_point(
    aes(y = cs_low),
    color = "#E74C3C",
    size = 1,
    fill = "#E74C3C",
    shape = 21,
    stroke = 0.4
  ) +
  # Upper bound points
  geom_point(
    aes(y = cs_high),
    color = "#5A8CAE",
    size = 1,
    fill = "#5A8CAE",
    shape = 21,
    stroke = 0.4
  ) +
  # Reference line (on top of grid lines)
  geom_hline(
    yintercept = 0,
    linetype = "dashed",
    color = "#2C3E50",
    size = 0.8
  ) +
  labs(
    # title = "Welfare Changes for Participating Schools",
    x = NULL,    # Removed x-axis title
    y = "Change in Welfare ($10,000/month)"
  ) +
  theme_minimal(base_size = 20) +
  theme(
    text = element_text(family = "serif"),
    plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 20),
    axis.text = element_text(size = 20),
    axis.text.x = element_blank(),  # Removed x-axis text
    legend.position = "none",
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_line(color = "grey90"),
    plot.margin = margin(20, 20, 20, 20)
  ) +
  scale_y_continuous(
    limits = c(-1.0, 4.0),
    breaks = seq(-1.5, 4.5, by = 0.5)
  )
CS_bounds_p

ggsave(
  paste0(dir_savefigs,"CS_interval_participating.pdf"),
  width = 8,
  height = 6,
  dpi = 300,
  device = cairo_pdf
)

CS_bounds_np <- Welfare_bounds_Not_participating %>%
  ggplot(., aes(x = id)) +
  # Horizontal grid lines for easier tracking
  geom_hline(
    yintercept = seq(-1.5, 4.5, by = 0.5),
    color = "grey95",
    size = 0.5
  ) +
  # Error bars
  geom_errorbar(
    aes(ymin = cs_low, ymax = cs_high),
    width = 0.2,
    size = 0.8,
    color = "#7F8C8D"
  ) +
  # Lower bound points
  geom_point(
    aes(y = cs_low),
    color = "#E74C3C",
    size = 1,
    fill = "#E74C3C",
    shape = 21,
    stroke = 0.4
  ) +
  # Upper bound points
  geom_point(
    aes(y = cs_high),
    color = "#5A8CAE",
    size = 1,
    fill = "#5A8CAE",
    shape = 21,
    stroke = 0.4
  ) +
  # Reference line (on top of grid lines)
  geom_hline(
    yintercept = 0,
    linetype = "dashed",
    color = "#2C3E50",
    size = 0.8
  ) +
  labs(
    # title = "Welfare Changes for Participating Schools",
    x = NULL,    # Removed x-axis title
    y = "Change in Welfare ($10,000/month)"
  ) +
  theme_minimal(base_size = 20) +
  theme(
    text = element_text(family = "serif"),
    plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 20),
    axis.text = element_text(size = 20),
    axis.text.x = element_blank(),  # Removed x-axis text
    legend.position = "none",
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_line(color = "grey90"),
    plot.margin = margin(20, 20, 20, 20)
  ) +
  scale_y_continuous(
    limits = c(-1.0, 4.0),
    breaks = seq(-1.5, 4.5, by = 0.5)
  )
CS_bounds_np

ggsave(
  paste0(dir_savefigs,"CS_interval_NOT_participating.pdf"),
  width = 8,
  height = 6,
  dpi = 300,
  device = cairo_pdf
)

## Figure 7 ----

DiD_Welfare_bounds <- df_analysis_stats_catD %>%
  mutate(erate_discount = as.numeric(erate_discount))%>%
  group_by(district_organization_name) %>%
  filter(!any(is.na(erate_discount))) %>%  # Keep groups where erate is NaN
  arrange(D) %>%
  mutate(cs_low = total_mbps[1]*(price[1]*(1-erate_discount[1])-(price[1]+price_DiD)*(1-erate_discount[2]))) %>%
  mutate(RHS1=price[1]*(1-erate_discount[1])-(price[1]+price_DiD)*(1-erate_discount[2])) %>% 
  mutate(RHS2 = ((total_mbps[1]+q_DiD)-total_mbps[1])) %>% 
  mutate(RHS3 = log((total_mbps[1]+q_DiD)/total_mbps[1])) %>%
  mutate(cs_high = (RHS1*RHS2)/(RHS3))%>%
  mutate(cs_high = ifelse(price_change=="decrease" & bandwidth_change=="unchanged",cs_low,cs_high)) %>% 
  filter(!(price_change == "unchanged" & bandwidth_change == "Decrease") & !(price_change == "increase" & bandwidth_change == "unchanged")) %>%
  mutate(cs_interval = cs_high-cs_low)%>%
  filter(D == 1)%>%
  mutate(gain_loss=ifelse(cs_low >= 0, "gain", "loss"))%>%
  mutate(cs_interval = ifelse(gain_loss=="loss",-cs_interval,cs_interval)) %>% 
  filter (bandwidth_change != "unchanged") %>% 
  ungroup()%>%
  arrange(cs_high)


DiD_Welfare_bounds_participating <- as.data.frame(DiD_Welfare_bounds) %>%
  filter(final_disposition=="Participating") %>%
  mutate(id=row_number())

## Hard coded coefficients  (these are actually g=0.5)
price_Did_g_0 = -11.905846
q_DiD_g_0 = 272.3005

Manski_Welfare_bounds_g_0 <- df_analysis_stats_catD %>%
  mutate(erate_discount = as.numeric(erate_discount))%>%
  group_by(district_organization_name) %>%
  filter(!any(is.na(erate_discount))) %>%  # Keep groups where erate is NaN
  arrange(D) %>%
  mutate(cs_low = total_mbps[1]*(price[1]*(1-erate_discount[1])-(price[1]+price_Did_g_0)*(1-erate_discount[2]))) %>%
  mutate(RHS1=price[1]*(1-erate_discount[1])-(price[1]+price_Did_g_0)*(1-erate_discount[2])) %>% 
  mutate(RHS2 = ((total_mbps[1]+q_DiD_g_0)-total_mbps[1])) %>% 
  mutate(RHS3 = log((total_mbps[1]+q_DiD_g_0)/total_mbps[1])) %>%
  mutate(cs_high = (RHS1*RHS2)/(RHS3))%>%
  mutate(cs_high = ifelse(price_change=="decrease" & bandwidth_change=="unchanged",cs_low,cs_high)) %>% 
  filter(!(price_change == "unchanged" & bandwidth_change == "Decrease") & !(price_change == "increase" & bandwidth_change == "unchanged")) %>%
  mutate(cs_interval = cs_high-cs_low)%>%
  filter(D == 1)%>%
  mutate(gain_loss=ifelse(cs_low >= 0, "gain", "loss"))%>%
  mutate(cs_interval = ifelse(gain_loss=="loss",-cs_interval,cs_interval)) %>% 
  filter (bandwidth_change != "unchanged") %>% 
  ungroup()%>%
  arrange(cs_high)

Manski_Welfare_bounds_participating_g_0 <- as.data.frame(Manski_Welfare_bounds_g_0) %>%
  filter(final_disposition=="Participating") %>%
  mutate(id=row_number())



# these are actually g=1.5
price_Did_g_1n75 = -6.516669
q_DiD_g_1n75 = 204.7026

Manski_Welfare_bounds_g_1n75 <- df_analysis_stats_catD %>%
  mutate(erate_discount = as.numeric(erate_discount))%>%
  group_by(district_organization_name) %>%
  filter(!any(is.na(erate_discount))) %>%  # Keep groups where erate is NaN
  arrange(D) %>%
  mutate(cs_low = total_mbps[1]*(price[1]*(1-erate_discount[1])-(price[1]+price_Did_g_1n75)*(1-erate_discount[2]))) %>%
  mutate(RHS1=price[1]*(1-erate_discount[1])-(price[1]+price_Did_g_1n75)*(1-erate_discount[2])) %>% 
  mutate(RHS2 = ((total_mbps[1]+q_DiD_g_1n75)-total_mbps[1])) %>% 
  mutate(RHS3 = log((total_mbps[1]+q_DiD_g_1n75)/total_mbps[1])) %>%
  mutate(cs_high = (RHS1*RHS2)/(RHS3))%>%
  mutate(cs_high = ifelse(price_change=="decrease" & bandwidth_change=="unchanged",cs_low,cs_high)) %>% 
  filter(!(price_change == "unchanged" & bandwidth_change == "Decrease") & !(price_change == "increase" & bandwidth_change == "unchanged")) %>%
  mutate(cs_interval = cs_high-cs_low)%>%
  filter(D == 1)%>%
  mutate(gain_loss=ifelse(cs_low >= 0, "gain", "loss"))%>%
  mutate(cs_interval = ifelse(gain_loss=="loss",-cs_interval,cs_interval)) %>% 
  filter (bandwidth_change != "unchanged") %>% 
  ungroup()%>%
  arrange(cs_high)

Manski_Welfare_bounds_participating_g_1n75 <- as.data.frame(Manski_Welfare_bounds_g_1n75) %>%
  filter(final_disposition=="Participating") %>%
  # filter(cs_high>=0)%>%
  mutate(id=row_number())


# First, let's combine all the data into one dataframe with a g indicator
combined_bounds <- bind_rows(
  DiD_Welfare_bounds_participating %>% 
    mutate(g = "Baseline"),
  Manski_Welfare_bounds_participating_g_0 %>% 
    mutate(g = "g = 0.5"),
  Manski_Welfare_bounds_participating_g_1n75 %>% 
    mutate(g = "g = 1.5")
) %>%
  # Ensure g is a factor with desired order
  mutate(g = factor(g, levels = c("g = 0.5", "Baseline", "g = 1.5"))) %>% 
  mutate(cs_low = cs_low/10000, cs_high = cs_high/10000) %>% 
  group_by(g) %>%
  mutate(x_spaced = row_number() * 1.1) %>%
  ungroup()

# Create plot with gapped transparent bars
gapped_bars <- ggplot(combined_bounds, aes(x = x_spaced, color = g, fill = g)) +
  # Add bars
  geom_linerange(
    aes(ymin = cs_low, ymax = cs_high),
    size = 1.5,
    alpha = 0.4,
    position = position_identity()
  ) +
  # Reference line
  geom_hline(
    yintercept = 0,
    linetype = "dashed",
    color = "#2C3E50",
    size = 0.8
  ) +
  # Custom colors
  scale_color_manual(
    values = c(
      "g = 0.5" = "#E74C3C",
      "Baseline" = "#5A8CAE",
      "g = 1.5" = "#2ECC71"
    ),
    name = "Parallel Trends\nViolation Factor"
  ) +
  scale_fill_manual(
    values = c(
      "g = 0.5" = "#E74C3C",
      "Baseline" = "#5A8CAE",
      "g = 1.5" = "#2ECC71"
    )
  ) +
  labs(
    y = "Change in Welfare ($10,000/month)",
    x = NULL
  ) +
  theme_minimal(base_size = 20) +
  theme(
    text = element_text(family = "serif"),
    axis.title = element_text(size = 20),
    axis.text = element_text(size = 20),
    axis.text.x = element_blank(),
    legend.position = "right",
    legend.title = element_text(size = 16),
    legend.text = element_text(size = 16),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_line(color = "grey90"),
    plot.margin = margin(20, 20, 20, 20)
  ) +
  scale_y_continuous(
    limits = c(-0.5, 2.0),
    breaks = seq(-0.5, 2.0, by = 0.5)
  )

gapped_bars


# Save the plot
ggsave(
  paste0(dir_savefigs,"combined_welfare_bounds.pdf"),
  gapped_bars,
  width = 10,
  height = 6,
  dpi = 300
)


# Appendix ----

## Table A.1 ----

create_table_isps <- function(df) {
  df %>%
    mutate(Year = if_else(D == 0, 2014, 2015)) %>%
    add_count(vendor, Year, name = "total_schools") %>%
    mutate(vendor_agg = if_else(total_schools >= 6, vendor, "Other")) %>%
    count(vendor_agg, Year, region, name = "count") %>%
    pivot_wider(names_from = c(Year, region), values_from = count, values_fill = 0) %>%
    mutate(
      Total_2014 = rowSums(select(., `2014_Northeastern`:`2014_Northwestern`)),
      Total_2015 = rowSums(select(., `2015_Northeastern`:`2015_Northwestern`))
    ) %>%
    arrange(desc(Total_2014 + Total_2015)) %>%
    arrange(vendor_agg == "Other", row_number())
}

# Create the table
isp_region_table <- create_table_isps(df_analysis_clean_names)  # use the clean_names df instead of the final one

# Calculate column totals
totals <- isp_region_table %>%
  summarise(across(where(is.numeric), sum)) %>%
  mutate(vendor_agg = "Total")

# Combine the original data with totals
isp_region_table <- bind_rows(isp_region_table, totals)

# Create the table
latex_output <- create_latex_table_isp_region(isp_region_table)

# Print to console
cat(latex_output)

# Optionally write to file
writeLines(latex_output, paste0(dir_savetabs, "tab_isp_2014_2015.tex"))

## Table A.2

# Modified function to calculate only means
calculate_sumstats_means <- function(data, ...) {
  data %>% 
    summarize(
      mean_price = mean(price),
      mean_mbps = mean(total_mbps),
      mean_fiber = mean(fiber), 
      mean_coaxial = mean(coaxial),
      mean_other = mean(other),
      mean_catD = mean(CatDSchool),
      .groups = "drop"
    )
}

# Calculate summary stats grouped by D, region, and participant
sumstats_D_region_participant <- df_analysis_stats %>% 
  mutate(fiber = ifelse(type %in% c("FiOS", "Ethernet"), 1, 0),
         coaxial = ifelse(type %in% c("Cable Modem"), 1, 0),
         other = ifelse(type %in% c("Not Reported", "Wireless", "Dedicated access T-1 or DS-3", "DSL"), 1, 0)) %>% 
  group_by(D, region, participant) %>% 
  calculate_sumstats_means() %>%
  mutate(region = as.character(region),
         participant = as.character(participant))

# Split the data into pre (D=0) and post (D=1) dataframes
pre_data <- sumstats_D_region_participant %>%
  filter(D == 0) %>%
  select(-D) %>%
  pivot_longer(cols = -c(region, participant), 
               names_to = "variable", 
               names_prefix = "mean_", 
               values_to = "value") %>%
  pivot_wider(names_from = region, 
              values_from = value,
              names_prefix = "Pre_")

post_data <- sumstats_D_region_participant %>%
  filter(D == 1) %>%
  select(-D) %>%
  pivot_longer(cols = -c(region, participant), 
               names_to = "variable", 
               names_prefix = "mean_", 
               values_to = "value") %>%
  pivot_wider(names_from = region, 
              values_from = value,
              names_prefix = "Post_")

# Merge pre and post data
table_data <- pre_data %>%
  full_join(post_data, by = c("participant", "variable")) %>%
  mutate(
    variable = factor(variable, levels = c("price", "mbps", "fiber", "coaxial", "other", "catD")),
    participant = factor(participant, levels = c("TRUE", "FALSE"))
  ) %>%
  arrange(variable, participant)

# Function to format numbers to 2 decimal places
format_number <- function(x) {
  ifelse(is.na(x), "--", sprintf("%.2f", x))
}

# Prepare the data
table_data <- table_data %>%
  mutate(across(-c(variable, participant), format_number)) %>%
  mutate(
    variable = case_when(
      variable == "price" ~ "Price",
      variable == "mbps" ~ "Bandwidth",
      variable == "fiber" ~ "Fiber",
      variable == "coaxial" ~ "Coaxial",
      variable == "other" ~ "Other",
      variable == "catD" ~ "Category D",
      TRUE ~ variable
    ),
    participant = case_when(
      participant == "TRUE" ~ "Participant",
      participant == "FALSE" ~ "Non-participant"
    )
  )

# Get region names
region_names <- levels(factor(sumstats_D_region_participant$region))
region_names_display <- c("Cent.","N.E.","N.W.","South")

make_latex_table <- function(data, region_names, region_names_display, caption = "Summary Statistics by Region") {
  # Start table environment
  table_str <- "\\begin{table}[!htbp]\n\n"
  
  # Caption
  table_str <- paste0(table_str, "\\caption{\\label{tab:sumstats_region} ", caption, "}\n")
  
  table_str <- paste0(table_str, "\\hspace{-0.3in}\\begin{threeparttable}\n")
  
  # Table header
  table_str <- paste0(table_str, 
                      "\\begin{tabular}{lll", paste(rep("c", length(region_names) * 2), collapse=""), "}\n",
                      "\\toprule\n")
  table_str <- paste0(table_str, 
                      " & & \\multicolumn{4}{c}{Pre Consortium (2014)} & \\multicolumn{4}{c}{Post Consortium (2015)}\\\\\n")
  
  # Column headers
  headers <- c("Outcome", "Participant", 
               region_names_display, # Pre consortium names
               region_names_display) # Post consortium names
  table_str <- paste0(table_str,
                      paste(headers, collapse=" & "), " \\\\\n\\midrule\n")
  
  # Data rows
  for(i in 1:nrow(data)) {
    row <- data[i,]
    row_str <- paste(row, collapse=" & ")
    table_str <- paste0(table_str, row_str, " \\\\\n")
  }
  
  # Footer
  table_str <- paste0(table_str, 
                      "\\bottomrule\n\\end{tabular}\n",
                      "\\begin{tablenotes}[flushleft]\n\\small\n",
                      "\\item[1] This table presents means of key contract characteristics by region and consortium participation status. Regions are Northeast (N.E.), Central (Cent.), South (South), and Northwest (N.W.).  Price is measured in dollars per Mbps per month, and bandwidth is measured in Mbps. Fiber, Coaxial, and Other are indicator variables for connection type that sum to one within each region-participant-year cell. Category D is an indicator for the service type.\n",
                      "\\end{tablenotes}\n",
                      "\\end{threeparttable}\n\\end{table}\n")
  
  return(table_str)
}
latex_str <- make_latex_table(table_data, region_names, region_names_display)
cat(latex_str)
writeLines(latex_str, paste0(dir_savetabs,"tab_sumstats_region.tex"))



